/*
Date: October 2025
Project: Income and Child Maltreatment: Evidence from a Discontinuity in Tax Benefits
Author: Katherine Rittenhouse
Purpose: This file links birth records to CPS records using a pre-existing linkage.
Files in: Extract_10151_1_2021; Births_99_19; CPS_ID
Files out: births_refs_cw; Births_fkclient; cw_fkcons_fk_bid; CPS_ID_consistent; Births_fkconsistent
*/

* Start from an empty session and turn off output paging.
clear all
set more off

* Load the linkage extract (each row carries record1_* and record2_* fields).
import delimited using "Extract_10151_1_2021.txt", clear

* Keep only pairs in which neither record's source code starts with "DSMF".
keep if substr(record1_source,1,4)!="DSMF" & substr(record2_source,1,4)!="DSMF"

* Keep only pairs whose two records come from different sources.
keep if record1_source!=record2_source

* fkclient_t: taken from the id1 slot of whichever record labels that slot
* "fkclient_t" (record 2 is processed last, so it wins if both sides qualify).
gen fkclient_t = ""
foreach side in 1 2 {
    replace fkclient_t = record`side'_id1 if record`side'_id1_name=="fkclient_t"
}

* bcstateid: taken from the id2 slot of whichever record labels that slot "bcstateid".
gen bcstateid = ""
foreach side in 1 2 {
    replace bcstateid = record`side'_id2 if record`side'_id2_name=="bcstateid"
}

* Normalise bcstateid to 6 characters, based on its length before any change:
*   - 13-character values keep characters 8 through 13;
*   - values of 0 to 5 characters are left-padded with zeros;
*   - every other length is left untouched.
gen length = strlen(bcstateid)
replace bcstateid = substr(bcstateid,8,6) if length==13
replace bcstateid = substr("000000" + bcstateid, -6, 6) if length<=5



* bclocalid: taken from the id1 slot of whichever record labels that slot
* "bclocalid" (record 2 is processed last, so it wins if both sides qualify).
gen bclocalid = ""
foreach side in 1 2 {
    replace bclocalid = record`side'_id1 if record`side'_id1_name=="bclocalid"
}

* Normalise bclocalid to 6 characters, based on its length before any change:
*   - 13-character values keep characters 8 through 13;
*   - values of 0 to 5 characters are left-padded with zeros;
*   - every other length is left untouched.
gen lengthloc = strlen(bclocalid)
replace bclocalid = substr(bclocalid,8,6) if lengthloc==13
replace bclocalid = substr("000000" + bclocalid, -6, 6) if lengthloc<=5



* Birth year comes from the record whose source code starts with "B"
* (record 2 is processed last, so it wins if both sides qualify).
gen birthyear = .
forvalues side = 1/2 {
    replace birthyear = record`side'_year if substr(record`side'_source,1,1)=="B"
}

* Convert the year to text; a missing year becomes the placeholder "0000".
tostring birthyear, replace force
replace birthyear = "0000" if inlist(birthyear, "", ".")

* Birth identifier = year + state id + local id, concatenated as strings.
gen bid = birthyear + bcstateid + bclocalid

* Drop the 226 births that have neither a bclocalid nor a bcstateid
* (both ids are the all-zero placeholder).
keep if bclocalid!="000000" | bcstateid!="000000"

* Diagnostic: distribution of bid lengths.
gen length2 = strlen(bid)
tabulate length2

* Rename the two id components (names are given as abbreviations, as before).
rename bcstate stateid
rename bclocal localid

* Reduce to the link variables, remove exact duplicate rows, and discard
* links with a match probability below 0.8.
keep fkclient_t bid stateid localid match_prob
duplicates drop
keep if match_prob>=0.8

* Let each birth match to multiple fkclients, but force each fkclient to
* match to only one birth.

* Links without a client id cannot be used.
drop if fkclient_t==""

* Within each fkclient keep the link with the highest match probability.
* bid is added as a final sort key so that ties in match_prob are always
* broken the same way; Stata otherwise orders tied observations arbitrarily,
* which made the retained birth differ from run to run.
* NOTE(review): a missing match_prob sorts last and would be kept ahead of
* any non-missing value -- confirm match_prob is never missing.
bysort fkclient_t (match_prob bid): keep if _n==_N
/*
(76,431 observations deleted) = 3% of observations 
*/
save "births_refs_cw.dta",replace

* Match the linkage file to the birth records.
use "Datasets/Births_99_19.dta", clear
drop _merge

* NOTE - almost all of the duplicate bids in births data are due to missing bcstate/bclocalid.
* Keep a single observation per bid so that bid identifies a birth record.
duplicates drop bid, force
/*
Duplicates in terms of bid

(26,312 observations deleted) = 0.2% of bids
*/
compress

* One birth may be linked to several fkclients, hence 1:m.
merge 1:m bid using "births_refs_cw.dta", generate(_mbirths)

* Discard links whose bid is not in the births data (using-only rows).
keep if inlist(_mbirths, 1, 3)

* anyref = 1 when the birth has at least one linked fkclient, 0 otherwise.
generate anyref = _mbirths==3
drop _mbirths

save "Births_fkclient.dta", replace

*** Create a consistent client id for births matched to multiple fkclients.
* fkconsistent is the concatenation, in ascending fkclient_t order, of every
* fkclient_t linked to the same bid. It is built as a running concatenation
* within bid, so it works for any number of linked fkclients. The previous
* hand-unrolled version only handled up to 5 per birth and left larger
* groups with an empty id. Results for groups of 1 to 5 are unchanged.

keep bid fkclient_t
* tag = number of additional observations sharing the same bid (0 if unique).
duplicates tag bid, gen(tag)

* Step 1: the first row of each bid starts the chain.
bysort bid (fkclient_t): gen fkconsistent = fkclient_t if _n==1
* Step 2: each later row appends its own id to the chain built so far
* (replace runs in observation order, so row k sees the updated row k-1).
by bid: replace fkconsistent = fkconsistent[_n-1] + fkclient_t if _n>1
* Step 3: give every row of the bid the complete chain held by its last row.
by bid: replace fkconsistent = fkconsistent[_N]

sort bid fkconsistent
save "cw_fkcons_fk_bid.dta", replace

* Attach the consistent id to the CPS client-level file.
drop tag bid
* Births without any linked fkclient carry an empty fkclient_t.
drop if fkclient_t==""
merge 1:1 fkclient_t using "CPS_ID", gen(_m2)
* Drop linked fkclients that do not appear in CPS_ID.
drop if _m2==1
drop _m2
* CPS clients with no linked birth keep their own id as the consistent id.
replace fkconsistent = fkclient_t if fkconsistent==""
* Need to collapse to a single row per fkconsistent.
collapse (max) alleg* indic* subst* (sum) num* (min) minage, by(fkconsistent)
save "CPS_ID_consistent", replace

* Build the same consistent id on the births file and reduce it to one row
* per birth. The row with the smallest fkclient_t is retained, as before.
use "Births_fkclient.dta", clear
bysort bid (fkclient_t): gen fkconsistent = fkclient_t if _n==1
by bid: replace fkconsistent = fkconsistent[_n-1] + fkclient_t if _n>1
by bid: replace fkconsistent = fkconsistent[_N]
by bid: keep if _n==1
drop fkclient_t

* Bring in the pooled CPS outcomes; discard CPS rows with no matching birth.
merge m:1 fkconsistent using "CPS_ID_consistent", gen(_m2)

drop if _m2==2
save "Births_fkconsistent.dta", replace


